set more off
clear all


/*main categories, current*/

import delimited "cu_summaries.txt", clear 

keep if year<2019

drop footnote_codes
drop if period=="M13"

gen price_item = substr(series_id, 9, 10)
replace price_item  = subinstr(price_item, " ", "", .)


/*
SA0 - all items
SAF - food and beverages
SAH - housing
SAA - apparel
SAT - transportation
SAM - medical
SAR - recreation
SAE - education and communication
SAG - other goods and services
SAS - services
SAC - commodities
*/

keep if price_item=="SA0" || price_item=="SAF" || price_item=="SAH" || price_item=="SAA" || price_item=="SAT" || price_item=="SAM" /*
	*/  || price_item=="SAR" || price_item=="SAE" || price_item=="SAG" || price_item=="SAS" || price_item=="SAC"          

	


gen series_type = substr(series_id, 1, 4)

gen area = substr(series_id, 5, 4)
keep if area=="0000" || area=="0100" || area=="0200" || area=="0300" || area=="0400" /*keep 4 regions and total for US*/

sort area price_item series_type year period
by area price_item series_type year: egen price=mean(value)
by area price_item series_type year: egen number=count(value)
drop if number==1 & period=="M12" /*drop observations with only 1 december observation per year*/
drop value number

quietly by area price_item series_type year:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup period

keep if series_type=="CUUR" /*not seasonally adjusted*/
drop series_type series_id

reshape wide price, i(year area) j(price_item) string

sort area year

replace area="USA" if area=="0000"
replace area="Northeast" if area=="0100"
replace area="Midwest" if area=="0200"
replace area="South" if area=="0300"
replace area="West" if area=="0400"


save cu_summary_processed, replace




/*main categories, historical*/

import delimited "mu_summaries.txt", clear 

keep if year<1998

drop footnote_codes
drop if period=="M13"

gen price_item = substr(series_id, 9, 10)
replace price_item  = subinstr(price_item, " ", "", .)


/*
SA0 - all items
SA1 - food and beverages
SA2 - housing
SA3 - apparel
SA4 - transportation
SA5 - medical
SA6 - entertainment
SA7 - other goods and services
SAS - services
SAC - commodities
*/

keep if price_item=="SA0" || price_item=="SA1" || price_item=="SA2" || price_item=="SA3" || price_item=="SA4" || price_item=="SA5" /*
	*/  || price_item=="SA6" || price_item=="SA7" || price_item=="SAS" || price_item=="SAC"         
	
replace price_item="SA0_mu" if price_item=="SA0"
replace price_item="SAS_mu" if price_item=="SAS"
replace price_item="SAC_mu" if price_item=="SAC"
gen series_type = substr(series_id, 1, 4)

gen area = substr(series_id, 5, 4)
keep if area=="0000" || area=="0100" || area=="0200" || area=="0300" || area=="0400" /*keep 4 regions and total for US*/

sort area price_item series_type year period
by area price_item series_type year: egen price=mean(value)
by area price_item series_type year: egen number=count(value)
drop if number==1 & period=="M12" /*drop observations with only 1 december observation per year*/
drop value number

quietly by area price_item series_type year:  gen dup = cond(_N==1,0,_n)
drop if dup>1
drop dup period

keep if series_type=="MUUR" /*not seasonally adjusted*/
drop series_type series_id


reshape wide price, i(year area) j(price_item) string

sort area year

replace area="USA" if area=="0000"
replace area="Northeast" if area=="0100"
replace area="Midwest" if area=="0200"
replace area="South" if area=="0300"
replace area="West" if area=="0400"



merge 1:1 area year using cu_summary_processed
drop _merge

gen price_total=priceSA0
gen price_food=priceSAF
gen price_housing=priceSAH
gen price_apparel=priceSAA
gen price_transport=priceSAT
gen price_med=priceSAM
gen price_entert=priceSA6
gen price_rec=priceSAR
gen price_educ_comm=priceSAE
gen price_oth=priceSAG
gen price_services=priceSAS
gen price_commodities=priceSAC

/*initial values for two new prices*/
replace price_rec=100 if year==1997 & area!="USA"
replace price_educ_comm=100 if year==1997 & area!="USA"

keep year area price_total price_food price_housing price_apparel price_transport price_med price_entert price_rec price_educ_comm price_oth price_services price_commodities


save major_cat_processed, replace



/*durables and nondurables; shelter*/

local list midwest northeast south west us

foreach name of local list {

		import delimited "other_`name'.txt", clear 
		
		keep if year<2019
		
		drop footnote_codes
		drop if period=="M13"
		gen price_item = substr(series_id, 9, 10)
		replace price_item  = subinstr(price_item, " ", "", .)
		/*
		SAD - durables
		SAN - nondurable
		*/
		keep if price_item=="SAD" || price_item=="SAN" || price_item=="SAH1"   
		gen series_type = substr(series_id, 1, 4)
		gen area = substr(series_id, 5, 4)
		
		keep if area=="0000" || area=="0100" || area=="0200" || area=="0300" || area=="0400"
		
		
		sort area price_item series_type year period
		by area price_item series_type year: egen price=mean(value)
		by area price_item series_type year: egen number=count(value)
		drop if number==1 & period=="M12" /*drop observations with only 1 december observation per year*/
		drop value number
		
		quietly by area price_item series_type year:  gen dup = cond(_N==1,0,_n)
		drop if dup>1
		drop dup period
		
		keep if series_type=="CUUR" /*not seasonally adjusted*/
		drop series_type series_id
		
		
		reshape wide price, i(year area) j(price_item) string
		
		sort area year
		
		replace area="USA" if area=="0000"
		replace area="Northeast" if area=="0100"
		replace area="Midwest" if area=="0200"
		replace area="South" if area=="0300"
		replace area="West" if area=="0400"
		
		rename priceSAD price_durables
		rename priceSAN price_nondurables
		rename priceSAH1 price_shelter
		
		save `name'_dur_nondur_processed, replace	
}

local list midwest northeast south west

foreach name of local list {
	
	append using `name'_dur_nondur_processed
	erase `name'_dur_nondur_processed.dta

}

merge 1:1 area year using major_cat_processed
drop _merge
sort area year


rename area region

*construct recreation price by smoothly pasting entert and rec
gen price_rec_sp=.
replace price_rec_sp=price_entert
by region: replace price_rec_sp=price_rec_sp[_n-1]*price_rec/price_rec[_n-1] if price_rec[_n-1]!=.
label variable price_rec_sp "Smoothly pasted entertainment and recreation index"

save prices_by_region, replace




erase cu_summary_processed.dta
erase major_cat_processed.dta
erase us_dur_nondur_processed.dta
